Data Exploration and Preparing for Modelling
# Load necessary libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(lubridate)
library(skimr)
library(timetk)
## Warning: package 'timetk' was built under R version 4.3.2
##
## Attaching package: 'timetk'
##
## The following object is masked from 'package:data.table':
##
## :=
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:data.table':
##
## hour, month, week, year
##
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## Warning: package 'dials' was built under R version 4.3.2
## Warning: package 'infer' was built under R version 4.3.2
## Warning: package 'modeldata' was built under R version 4.3.2
## Warning: package 'parsnip' was built under R version 4.3.2
## Warning: package 'rsample' was built under R version 4.3.2
## Warning: package 'tune' was built under R version 4.3.2
## Warning: package 'workflows' was built under R version 4.3.2
## Warning: package 'workflowsets' was built under R version 4.3.2
## Warning: package 'yardstick' was built under R version 4.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ data.table::first() masks dplyr::first()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ data.table::last() masks dplyr::last()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## ✖ data.table::transpose() masks purrr::transpose()
## • Learn how to get started at https://www.tidymodels.org/start/
library(modeltime)
## Warning: package 'modeltime' was built under R version 4.3.2
##
## Attaching package: 'modeltime'
##
## The following object is masked from 'package:data.table':
##
## :=
# Check the current working directory
getwd()
## [1] "C:/Users/ACER/Desktop/R github portfolio"
# Load and explore the dataset
data <- read.csv("airpassengers.csv")
# Data Exploration
# Display the structure of the data
str(data)
## 'data.frame': 144 obs. of 2 variables:
## $ Month : chr "1949-01" "1949-02" "1949-03" "1949-04" ...
## $ X.Passengers: int 112 118 132 129 121 135 148 148 136 119 ...
# Visualize the data in the RStudio Viewer
View(data)
# Rename columns
colnames(data) <- c('Date', 'Count')
# Display the structure of the data after renaming columns
data %>% glimpse()
## Rows: 144
## Columns: 2
## $ Date <chr> "1949-01", "1949-02", "1949-03", "1949-04", "1949-05", "1949-06"…
## $ Count <int> 112, 118, 132, 129, 121, 135, 148, 148, 136, 119, 104, 118, 115,…
# Preprocess date column
data$Date <- paste(data$Date, "-01", sep="")
data$Date <- as.Date(data$Date, format="%Y-%m-%d")
# Display the structure of the data after preprocessing date column
str(data)
## 'data.frame': 144 obs. of 2 variables:
## $ Date : Date, format: "1949-01-01" "1949-02-01" ...
## $ Count: int 112 118 132 129 121 135 148 148 136 119 ...
# Visualize time series data
# Assuming `interactive` is a logical variable
data %>% plot_time_series(Date, Count, .interactive = TRUE)
# Time series splitting
splits <- initial_time_split(data, prop = 0.9)
Model Evaluation
# Compare RMSE score on test data
predictions_arima_boosted <- predict(model_fit_arima_boosted, new_data = testing(splits)) %>%
bind_cols(testing(splits))
predictions_prophet <- predict(model_fit_prophet, new_data = testing(splits)) %>%
bind_cols(testing(splits))
predictions_ets <- predict(model_fit_ets, new_data = testing(splits)) %>%
bind_cols(testing(splits))
rmse_arima_boosted <- sqrt(mean((predictions_arima_boosted$Count - predictions_arima_boosted$.pred)^2))
rmse_prophet <- sqrt(mean((predictions_prophet$Count - predictions_prophet$.pred)^2))
rmse_ets <- sqrt(mean((predictions_ets$Count - predictions_ets$.pred)^2))
cat("RMSE for arima_boosted:", rmse_arima_boosted, "\n")
## RMSE for arima_boosted: 19.35821
cat("RMSE for prophet:", rmse_prophet, "\n")
## RMSE for prophet: 39.10063
cat("RMSE for ets:", rmse_ets, "\n")
## RMSE for ets: 26.07049
# Model Selection and Forecasting
# Make forecast on the model with the lowest RMSE score
lowest_rmse_model <- which.min(c(rmse_arima_boosted, rmse_prophet, rmse_ets))
if (lowest_rmse_model == 1) {
final_forecast <- predict(model_fit_arima_boosted, new_data = data) %>%
bind_cols(data)
} else if (lowest_rmse_model == 2) {
final_forecast <- predict(model_fit_prophet, new_data = data) %>%
bind_cols(data)
} else {
final_forecast <- predict(model_fit_ets, new_data = data) %>%
bind_cols(data)
}
# Visualize past data and forecast values on one plot with different colors
ggplot() +
geom_line(data = final_forecast, aes(x = Date, y = .pred, color = "Forecast"), linetype = "dashed") +
geom_line(data = data, aes(x = Date, y = Count, color = "Actual")) +
labs(title = "Past Data and Forecast Values",
x = "Date",
y = "Count") +
scale_color_manual(values = c("Actual" = "blue", "Forecast" = "red"), guide = "legend") +
theme_minimal()
